{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 简单的数据探索\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"../data/training.csv\")\n",
    "data.columns = [\"type\",\"text\"]\n",
    "\n",
    "data[\"type\"] = data[\"type\"]. map(lambda s: int(s) - 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 4773 entries, 0 to 4772\n",
      "Data columns (total 2 columns):\n",
      "type    4773 non-null int64\n",
      "text    4773 non-null object\n",
      "dtypes: int64(1), object(1)\n",
      "memory usage: 74.7+ KB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>4773.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>4.070186</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>2.286824</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>3.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>10.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              type\n",
       "count  4773.000000\n",
       "mean      4.070186\n",
       "std       2.286824\n",
       "min       0.000000\n",
       "25%       2.000000\n",
       "50%       3.000000\n",
       "75%       5.000000\n",
       "max      10.000000"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib import pyplot\n",
    "import seaborn as sns\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEKCAYAAAAFJbKyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAF8lJREFUeJzt3XuUZWV95vHvYwOKSGwuFRZ2k3RHGSNjHCU9iGKMI15ADO2oEIkXUAxmRAclSwUHxYyTiU5UvCQSGSG2ESHYoBDFC0HQNcni0giRu7QI0i2XUhB6ZBTB3/xx3oJD25ezu+qcU0V9P2vVOnu/+z3799aCVU/v27tTVUiSNKhHjXsAkqS5xeCQJHVicEiSOjE4JEmdGBySpE4MDklSJwaHJKkTg0OS1InBIUnqZKtxD2AYdt5551qyZMm4hyFJc8pll13246qa2Fy/R2RwLFmyhFWrVo17GJI0pyS5eZB+nqqSJHVicEiSOjE4JEmdGBySpE4MDklSJwaHJKkTg0OS1InBIUnqxOCQJHXyiHxyfL778in7D73GS9/w1Q22n/D5Fw+17tv/5OtD3b+kzfOIQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ0YHJKkTgwOSVInQwuOJKckuSPJVX1tf53kuiTfTfLFJAv7th2bZHWS65O8uK99v9a2OskxwxqvJGkwwzzi+Ayw33pt5wFPraqnAd8DjgVIsgfwKuDft+98MsmCJAuAvwX2B/YADml9JUljMrTgqKpvA3eu1/aNqrq/rV4ELG7Ly4HTq+oXVfUDYDWwV/tZXVU3VtV9wOmtryRpTMZ5jeMNwNS8FYuAW/q2rWltG2v/NUmOSLIqyarJyckhDFeSBGMKjiT/DbgfOHWm9llVJ1XVsqpaNjExMVO7lSStZ+STHCY5DHgpsG9VVWteC+zW121xa2MT7ZKkMRjpEUeS/YB3AgdW1b19m84BXpXk0UmWArsDlwCXArsnWZpkG3oX0M8Z5ZglSQ83tCOOJKcBzwN2TrIGOJ7eXVSPBs5LAnBRVf1ZVV2d5AzgGnqnsI6sqgfaft4CfB1YAJxSVVcPa8ySpM0bWnBU1SEbaD55E/3/EvjLDbSfC5w7g0OTJE2DT45LkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6mRowZHklCR3JLmqr23HJOcluaF97tDak+TjSVYn+W6SPfu+c2jrf0OSQ4c1XknSYIZ5xPEZYL/12o4Bzq+q3YHz2zrA/sDu7ecI4EToBQ1wPPBMYC/g+KmwkSSNx9CCo6q+Ddy5XvNyYEVbXgG8rK/9s9VzEbAwya7Ai4HzqurOqroLOI9fDyNJ0giN+hrHLlV1a1u+DdilLS8Cbunrt6a1baxdkjQmY7s4XlUF1EztL8kRSVYlWTU5OTlTu5UkrWfUwXF7OwVF+7yjta8Fduvrt7i1baz911TVSVW1rKqWTUxMzPjAJUk9ow6Oc4CpO6MOBc7ua39du7tqb+Dudkrr68CLkuzQLoq/qLVJksZkq2HtOMlpwPOAnZOsoXd31AeAM5IcDtwMHNy6nwu8BFgN3Au8HqCq7kzyfuDS1u+/V9X6F9wlSSM0tOCoqkM2smnfDfQt4MiN7OcU4JQZHJokaRp8clyS1InBIUnqxOCQJHVicEiSOjE4JEmdGBySpE4MDklSJwaHJKkTg0OS1MlmgyPJQUm2b8vHJTmr/w19kqT5ZZAjjvdU1bokzwFeAJxMe0OfJGn+GSQ4HmifBwAnVdVXgG2GNyRJ0mw2SHCsTfIp4I+Bc5M8esDvSZIegQYJgIPpvQPjxVX1U2BH4B1DHZUkadbabHBU1b303tT3nNZ0P3DDMAclSZq9Brmr6njgXcCxrWlr4HPDHJQkafYa5FTVfwYOBH4GUFU/ArYf5qAkSbPXIMFxX3tDXwEk2W64Q5IkzWaDBMcZ7a6qhUn+FPhn4H8Pd1iSpNlqs+8cr6oPJXkhcA/w74D3VtV5Qx+ZJGlW2mxwNFcC29I7XXXl8IYjSZrtBrmr6o3AJcDLgVcCFyV5w7AHJkmanQY54ngH8Iyq+glAkp2AfwVOGebAJEmz0yAXx38CrOtbX9faJEnz0CDBsRq4OMn72sOAFwHfS3J0kqO3pGiStye5OslVSU5L8pgkS5NcnGR1kn9Msk3r++i2vrptX7IlNSVJM2OQ4Pg+8CXacxzA2cAP6D0E2PlBwCSLgP8KLKuqpwILgFcBHwROqKonAXcBh7evHA7c1dpPaP0kSWMyyDWOs6pqpu+k2grYNskvgccCtwLPB/6kbV8BvI/eez+Wt2WAlcDfJEl7KFEauwPO/NRQ9/+VV7xpqPuXuhrkiOOTSS5J8uYkj59uwapaC3wI+CG9wLgbuAz4aVXd37qtARa15UXALe2797f+O62/3yRHJFmVZNXk5OR0hylJ2ohBZsf9A+A1wG7AZUk+n+RFW1owyQ70jiKWAk8AtgP229L99Y3zpKpaVlXLJiYmprs7SdJGDPRCpqr6HnAcvVly/xD4WJLrkrx8C2q+APhBVU1W1S+Bs4B96E1pMnXqbDGwti2vpRdatO2Px7u6JGlsBnkA8GlJTgCupXcd4o+q6ilt+YQtqPlDYO8kj00SYF/gGuACeg8YAhxK7yI8wDltnbb9m17fkKTxGeTi+CeATwPvrqr/N9VYVT9KclzXglV1cZKVwHfovRTqcuAk4CvA6Un+R2s7uX3lZOAfkqwG7qR3B5YkaUwGCY4vVtU/9DckOaqqPrZ++6Cq6njg+PWabwT22kDfnwMHbUkdSdLMG+Qax+s20HbYDI9DkjRHbPSII8kh9J6rWJrknL5N29M7ZSRJmoc2darqX+k9Z7Ez8OG+9nXAd4c5KEnS7LXR4Kiqm4GbgWeNbjiSpNluoOc4JEmaYnBIkjrZaHAkOb99OhutJOlBm7o4vmuSZwMHJjkdSP/GqvrOUEcmSZqVNhUc7wXeQ2/eqI+st63oTTkiSZpnNnVX1UpgZZL3VNX7RzgmSdIsttkpR6rq/UkOBJ7bmi6sqi8Pd1iSpNlqkNlx/wo4it4MttcARyX5n8MemCRpdhpkksMDgKdX1a8AkqygN3vtu4c5MEnS7DTocxwL+5an/fpYSdLcNcgRx18Blye5gN4tuc8FjhnqqCRJs9YgF8dPS3Ih8B9b07uq6rahjkqSNGsNcsRBVd1K7xWukqR5zrmqJEmdGBySpE42GRxJFiS5blSDkSTNfpsMjqp6ALg+yW+NaDySpFlukIvjOwBXJ7kE+NlUY1UdOLRRSZJmrUGC4z1DH4Ukac7Y7MXxqvoWcBOwdVu+FJjWuziSLEyyMsl1Sa5N8qwkOyY5L8kN7XOH1jdJPp5kdZLvJtlzOrUlSdMzyCSHfwqsBD7VmhYBX5pm3Y8BX6uq3wX+A3AtvafRz6+q3YHzeejp9P2B3dvPEcCJ06wtSZqGQW7HPRLYB7gHoKpuAH5zSwsmeTy9aUtObvu7r6p+CiwHVrRuK4CXteXlwGer5yJgYZJdt7S+JGl6BgmOX1TVfVMrSbai9wbALbUUmAT+PsnlST6dZDtgl/aEOsBtwC5teRFwS9/317Q2SdIYDBIc30rybmDbJC8EvgD80zRqbgXsCZxYVc+gd6fWwyZNrKqiYzglOSLJqiSrJicnpzE8SdKmDBIcx9A7QrgSeBNwLnDcNGquAdZU1cVtfSW9ILl96hRU+7yjbV8L7Nb3/cWt7WGq6qSqWlZVyyYmJqYxPEnSpgwyO+6v2subLqZ3FHB9OyLYIlV1W5Jbkjy5qq4H9uWhtwseCnygfZ7dvnIO8JYkpwPPBO7uO6UlSRqxzQZHkgOAvwO+T+99HEuTvKmqvjqNum8FTk2yDXAj8Hp6Rz9nJDkcuBk4uPU9F3gJsBq4t/WVJI3JIA8Afhj4T1W1GiDJE4GvAFscHFV1BbBsA5v23UDfondnlyRpFhjkGse6qdBobgTWDWk8kqRZbqNHHEle3hZXJTkXOIPeNY6D6D09LkmahzZ1quqP+pZvB/6wLU8C2w5tRJKkWW2jwVFVXoSWJP2aQe6qWkrvLqgl/f2dVl2S5qdB7qr6Er15pf4J+NVwhyNJmu0GCY6fV9XHhz4SSdKcMEhwfCzJ8cA3gF9MNVbVtN7JIUmamwYJjt8DXgs8n4dOVVVblyTNM4MEx0HA7/RPrS5Jmr8GeXL8KmDhsAciSZobBjniWAhcl+RSHn6Nw9txJWkeGiQ4jh/6KCRJc8Yg7+P41igGIkmaGwZ5cnwdD73GdRtga+BnVfUbwxyYJGl2GuSIY/up5SQBlgN7D3NQkqTZa5C7qh5UPV8CXjyk8UiSZrlBTlW9vG/1UfTe3PfzoY1IkjSrDXJXVf97Oe4HbqJ3ukqSNA8Nco3D93JIkh60qVfHvncT36uqev8QxiNJmuU2dcTxsw20bQccDuwEGBySNA9t6tWxH55aTrI9cBTweuB04MMb+54k6ZFtk9c4kuwIHA28GlgB7FlVd41iYJKk2Wmjz3Ek+WvgUmAd8HtV9b6ZDI0kC5JcnuTLbX1pkouTrE7yj0m2ae2Pbuur2/YlMzUGSVJ3m3oA8M+BJwDHAT9Kck/7WZfknhmofRRwbd/6B4ETqupJwF30rqXQPu9q7Se0fpKkMdlocFTVo6pq26ravqp+o+9n++nOU5VkMXAA8Om2HnpvFFzZuqwAXtaWl7d12vZ9W39J0hh0mnJkBn0UeCcPvYp2J+CnVXV/W18DLGrLi4BbANr2u1t/SdIYjDw4krwUuKOqLpvh/R6RZFWSVZOTkzO5a0lSn3EccewDHJjkJnq39j4f+BiwMMnUXV6LgbVteS2wG0Db/njgJ+vvtKpOqqplVbVsYmJiuL+BJM1jIw+Oqjq2qhZX1RLgVcA3q+rVwAXAK1u3Q4Gz2/I5bZ22/ZtVVUiSxmJc1zg25F3A0UlW07uGcXJrPxnYqbUfDRwzpvFJkhhsdtyhqaoLgQvb8o3AXhvo83PgoJEOTJK0UbPpiEOSNAcYHJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ0YHJKkTsY65YikLXfgyrM332kaznnl8qHuX3OXRxySpE4MDklSJwaHJKkTg0OS1IkXxyVpM6775O1Dr/G7b95l6DVmikcckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjrxOQ49Iux/9p8NvcZXl//d0GtIc8HIjziS7JbkgiTXJLk6yVGtfcck5yW5oX3u0NqT5ONJVif5bpI9Rz1mSdJDxnGq6n7gz6tqD2Bv4MgkewDHAOdX1e7A+W0dYH9g9/ZzBHDi6IcsSZoy8lNVVXUrcGtbXpfkWmARsBx4Xuu2ArgQeFdr/2xVFXBRkoVJdm37kTQGB5151VD3/4VXPHWo+9f0jPXieJIlwDOAi4Fd+sLgNmBq4pZFwC19X1vT2tbf1xFJViVZNTk5ObQxS9J8N7bgSPI44EzgbVV1T/+2dnRRXfZXVSdV1bKqWjYxMTGDI5Uk9RtLcCTZml5onFpVZ7Xm25Ps2rbvCtzR2tcCu/V9fXFrkySNwTjuqgpwMnBtVX2kb9M5wKFt+VDg7L7217W7q/YG7vb6hiSNzzie49gHeC1wZZIrWtu7gQ8AZyQ5HLgZOLhtOxd4CbAauBd4/WiHK0nqN467qv4PkI1s3ncD/Qs4cqiDkiQNzCfHJc0ZZ5z546Hu/+BX7DzU/T9SOFeVJKkTg0OS1InBIUnqxOCQJHVicEiSOjE4JEmdGBySpE4MDklSJwaHJKkTg0OS1IlTjgzR9z+xfKj7f+Jbz958J0lz2u0fvWSo+9/lbXt1/o5HHJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdfKIvx138sTPDb3GxH95zdBrSNJs4RGHJKkTg0OS1InBIUnqxOCQJHUyZ4IjyX5Jrk+yOskx4x6PJM1XcyI4kiwA/hbYH9gDOCTJHuMdlSTNT3MiOIC9gNVVdWNV3QecDgx36llJ0gbNleBYBNzSt76mtUmSRixVNe4xbFaSVwL7VdUb2/prgWdW1Vv6+hwBHNFWnwxcP42SOwM/nsb351rdcdaeb3XHWdvfeX7Unk7d366qic11mitPjq8FdutbX9zaHlRVJwEnzUSxJKuqatlM7Gsu1B1n7flWd5y1/Z3nR+1R1J0rp6ouBXZPsjTJNsCrgHPGPCZJmpfmxBFHVd2f5C3A14EFwClVdfWYhyVJ89KcCA6AqjoXOHdE5WbklNccqjvO2vOt7jhr+zvPj9pDrzsnLo5LkmaPuXKNQ5I0SxgcfcY1rUmSU5LckeSqUdVsdXdLckGSa5JcneSoEdZ+TJJLkvxbq/0Xo6rd6i9IcnmSL4+w5k1JrkxyRZJVo6rbai9MsjLJdUmuTfKsEdR8cvtdp37uSfK2Ydftq//29v/WVUlOS/KYEdU9qtW8eti/74b+diTZMcl5SW5onzvMeOGq8qd3um4B8H3gd4BtgH8D9hhR7ecCewJXjfh33hXYsy1vD3xvhL9zgMe15a2Bi4G9R/i7Hw18HvjyCGveBOw8yv/GfbVXAG9sy9sAC0dcfwFwG73nBEZRbxHwA2Dbtn4GcNgI6j4VuAp4LL1ryP8MPGmI9X7tbwfwv4Bj2vIxwAdnuq5HHA8Z27QmVfVt4M5R1Fqv7q1V9Z22vA64lhE9kV89/7etbt1+RnLBLcli4ADg06OoN25JHk/vD8zJAFV1X1X9dMTD2Bf4flXdPMKaWwHbJtmK3h/yH42g5lOAi6vq3qq6H/gW8PJhFdvI347l9P6hQPt82UzXNTgeMq+nNUmyBHgGvX/5j6rmgiRXAHcA51XVqGp/FHgn8KsR1ZtSwDeSXNZmOhiVpcAk8Pft9Nynk2w3wvrQe/bqtFEVq6q1wIeAHwK3AndX1TdGUPoq4A+S7JTkscBLePjDy6OwS1Xd2pZvA3aZ6QIGh0jyOOBM4G1Vdc+o6lbVA1X1dHozAeyV5KnDrpnkpcAdVXXZsGttwHOqak96szwfmeS5I6q7Fb3TGSdW1TOAn9E7hTES7aHdA4EvjLDmDvT+5b0UeAKwXZLXDLtuVV0LfBD4BvA14ArggWHX3cR4iiEcyRscD9nstCaPREm2phcap1bVWeMYQzttcgGw3wjK7QMcmOQmeqcjn5/kcyOoO/WvYKrqDuCL9E6PjsIaYE3fEd1KekEyKvsD36mq20dY8wXAD6pqsqp+CZwFPHsUhavq5Kr6/ap6LnAXvWuHo3R7kl0B2ucdM13A4HjIvJvWJEnonfe+tqo+MuLaE0kWtuVtgRcC1w27blUdW1WLq2oJvf/G36yqof9LNMl2SbafWgZeRO+0xtBV1W3ALUme3Jr2Ba4ZRe3mEEZ4mqr5IbB3kse2/8/3pXcNb+iS/Gb7/C161zc+P4q6fc4BDm3LhwJnz3SBOfPk+LDVGKc1SXIa8Dxg5yRrgOOr6uQRlN4HeC1wZbvWAPDu6j2lP2y7AivaS7oeBZxRVSO7NXYMdgG+2PsbxlbA56vqayOs/1bg1PaPohuB14+iaAvJFwJvGkW9KVV1cZKVwHeA+4HLGd2T3Gcm2Qn4JXDkMG9E2NDfDuADwBlJDgduBg6e8brtli1JkgbiqSpJUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIM6DNPvvmcY9DGgWDQ5oZCwGDQ/OCwSHNjA8AT2zvnfhCkgdnJE1yapLlSQ5LcnaSC9u7Eo7v6/Oa9n6SK5J8qj0YKc1KBoc0M46hN23404G/AQ6DB6c0fzbwldZvL+AVwNOAg5IsS/IU4I+Bfdr3HwBePdrhS4NzyhFphlXVt5J8MskEvZA4s01pA73p438CkOQs4Dn0psT4feDS1mdbhjAxnTRTDA5pOD4LvIbeRIr980KtP8dP0Xsb4oqqOnZEY5OmxVNV0sxYR+/1u1M+A7wNoKr6Z6J9YXsn9Lb03sz2L8D5wCv7ZlXdMclvj2TU0hbwiEOaAVX1kyT/kuQq4KtV9Y4k1wJfWq/rJfTef7IY+FxVrQJIchy9twM+ijarKr2ZTaVZx9lxpSForw29Etizqu5ubYcBy6rqLeMcmzRdnqqSZliSF9B7adAnpkJDeiTxiEOS1IlHHJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdfL/AdDxytb6KEsrAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.countplot(data[\"type\"]);\n",
    "pyplot.xlabel('type');\n",
    "pyplot.ylabel('Number of types');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 总结\n",
    "1.  类型是从1开始的，最好改成从0开始计算\n",
    "2. 各个类型的分类很不均，集中在2，3，5分类, 在模型校验的时候要设置权重"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
